In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
!pip install ydata-profiling
Requirement already satisfied: ydata-profiling in c:\users\sylvia.pereira\anaconda3\lib\site-packages (4.1.0)
Requirement already satisfied: matplotlib<3.7,>=3.2 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (3.5.2)
Requirement already satisfied: imagehash==4.3.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (4.3.1)
Requirement already satisfied: statsmodels<0.14,>=0.13.2 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (0.13.2)
Requirement already satisfied: visions[type_image_path]==0.7.5 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (0.7.5)
Requirement already satisfied: PyYAML<6.1,>=5.0.0 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (6.0)
Requirement already satisfied: phik<0.13,>=0.11.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (0.12.3)
Requirement already satisfied: scipy<1.10,>=1.4.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (1.9.1)
Requirement already satisfied: pydantic<1.11,>=1.8.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (1.10.6)
Requirement already satisfied: multimethod<1.10,>=1.4 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (1.9.1)
Requirement already satisfied: tqdm<4.65,>=4.48.2 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (4.64.1)
Requirement already satisfied: seaborn<0.13,>=0.10.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (0.11.2)
Requirement already satisfied: numpy<1.24,>=1.16.0 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (1.21.5)
Requirement already satisfied: htmlmin==0.1.12 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (0.1.12)
Requirement already satisfied: jinja2<3.2,>=2.11.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (2.11.3)
Requirement already satisfied: typeguard<2.14,>=2.13.2 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (2.13.3)
Requirement already satisfied: pandas!=1.4.0,<1.6,>1.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (1.4.4)
Requirement already satisfied: requests<2.29,>=2.24.0 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from ydata-profiling) (2.28.1)
Requirement already satisfied: pillow in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from imagehash==4.3.1->ydata-profiling) (9.2.0)
Requirement already satisfied: PyWavelets in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from imagehash==4.3.1->ydata-profiling) (1.3.0)
Requirement already satisfied: tangled-up-in-unicode>=0.0.4 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling) (0.2.0)
Requirement already satisfied: attrs>=19.3.0 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling) (21.4.0)
Requirement already satisfied: networkx>=2.4 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling) (2.8.4)
Requirement already satisfied: MarkupSafe>=0.23 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from jinja2<3.2,>=2.11.1->ydata-profiling) (2.0.1)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling) (2.8.2)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling) (1.4.2)
Requirement already satisfied: cycler>=0.10 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling) (0.11.0)
Requirement already satisfied: pyparsing>=2.2.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling) (3.0.9)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling) (4.25.0)
Requirement already satisfied: packaging>=20.0 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from matplotlib<3.7,>=3.2->ydata-profiling) (21.3)
Requirement already satisfied: pytz>=2020.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from pandas!=1.4.0,<1.6,>1.1->ydata-profiling) (2022.1)
Requirement already satisfied: joblib>=0.14.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from phik<0.13,>=0.11.1->ydata-profiling) (1.1.0)
Requirement already satisfied: typing-extensions>=4.2.0 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from pydantic<1.11,>=1.8.1->ydata-profiling) (4.3.0)
Requirement already satisfied: idna<4,>=2.5 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from requests<2.29,>=2.24.0->ydata-profiling) (3.3)
Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from requests<2.29,>=2.24.0->ydata-profiling) (2.0.4)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from requests<2.29,>=2.24.0->ydata-profiling) (2022.9.14)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from requests<2.29,>=2.24.0->ydata-profiling) (1.26.11)
Requirement already satisfied: patsy>=0.5.2 in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from statsmodels<0.14,>=0.13.2->ydata-profiling) (0.5.2)
Requirement already satisfied: colorama in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from tqdm<4.65,>=4.48.2->ydata-profiling) (0.4.5)
Requirement already satisfied: six in c:\users\sylvia.pereira\anaconda3\lib\site-packages (from patsy>=0.5.2->statsmodels<0.14,>=0.13.2->ydata-profiling) (1.16.0)
In [2]:
# NOTE(review): redundant — warnings was already imported and filtered
# in the first cell; this cell can be removed.
import warnings
warnings.filterwarnings('ignore')
In [3]:
# NOTE(review): hardcoded absolute local path — prefer a configurable
# DATA_DIR (pathlib.Path) so the notebook runs on other machines.
DATA_PATH = r'C:\Users\sylvia.pereira\OneDrive - alteryx.com\Desktop\Dataset_EDA_Combined_4.csv'
df = pd.read_csv(DATA_PATH)
# Quick look at the first rows (19077 rows x 23 columns per later cells).
df.head()
Out[3]:
ID_Student course_year_month pass_course dataplus dualpane externalquiz folder forumng glossary homepage ... ouelluminate ouwiki page questionnaire quiz repeatactivity resource sharedsubpage subpage url
0 560374 GGG 2013_October True 0 0 0 0 0 0 51 ... 0 0 0 0 65 0 5 0 8 0
1 519684 EEE 2013_October True 0 2 0 0 525 0 706 ... 0 193 0 0 371 0 77 0 215 60
2 570529 GGG 2013_October True 0 0 0 0 45 2 275 ... 0 0 0 0 158 0 77 0 15 0
3 643653 FFF 2014_October True 0 0 0 0 411 0 470 ... 0 14 5 19 1001 0 20 0 254 37
4 678680 BBB 2014_October True 0 0 0 0 109 4 235 ... 0 0 0 0 118 0 84 0 26 0

5 rows × 23 columns

In [4]:
# Summary statistics for the 21 numeric columns (counts, mean/std, quartiles).
df.describe()
Out[4]:
ID_Student dataplus dualpane externalquiz folder forumng glossary homepage htmlactivity oucollaborate ... ouelluminate ouwiki page questionnaire quiz repeatactivity resource sharedsubpage subpage url
count 1.907700e+04 19077.000000 19077.000000 19077.000000 19077.000000 19077.000000 19077.000000 19077.000000 19077.000000 19077.000000 ... 19077.000000 19077.000000 19077.000000 19077.000000 19077.000000 19077.000000 19077.000000 19077.000000 19077.000000 19077.000000
mean 6.975854e+05 1.815852 0.898779 2.927504 0.266132 319.979032 3.505321 294.471143 0.307019 4.781098 ... 1.762017 40.021754 2.577607 2.504010 317.310321 0.000262 49.663784 0.006395 147.869319 24.287886
std 5.443597e+05 7.233321 3.119043 10.097041 0.980568 636.726445 31.645296 362.405249 1.440269 12.927816 ... 9.147729 96.031117 6.254059 7.306026 551.266264 0.026104 96.579696 0.099595 187.656337 40.288575
min 6.516000e+03 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 5.034910e+05 0.000000 0.000000 0.000000 0.000000 34.000000 0.000000 88.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 14.000000 0.000000 25.000000 3.000000
50% 5.867620e+05 0.000000 0.000000 0.000000 0.000000 132.000000 0.000000 194.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 95.000000 0.000000 31.000000 0.000000 76.000000 12.000000
75% 6.349280e+05 0.000000 0.000000 0.000000 0.000000 352.000000 0.000000 379.000000 0.000000 4.000000 ... 0.000000 34.000000 2.000000 0.000000 485.000000 0.000000 60.000000 0.000000 212.000000 32.000000
max 2.698577e+06 143.000000 69.000000 340.000000 13.000000 13154.000000 1364.000000 8543.000000 33.000000 316.000000 ... 317.000000 2117.000000 334.000000 65.000000 13032.000000 3.000000 5147.000000 4.000000 4346.000000 2134.000000

8 rows × 21 columns

In [5]:
# Column dtypes and non-null counts: 21 int64 activity counters,
# 1 object (course_year_month), 1 bool (pass_course); no missing values.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19077 entries, 0 to 19076
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ID_Student         19077 non-null  int64 
 1   course_year_month  19077 non-null  object
 2   pass_course        19077 non-null  bool  
 3   dataplus           19077 non-null  int64 
 4   dualpane           19077 non-null  int64 
 5   externalquiz       19077 non-null  int64 
 6   folder             19077 non-null  int64 
 7   forumng            19077 non-null  int64 
 8   glossary           19077 non-null  int64 
 9   homepage           19077 non-null  int64 
 10  htmlactivity       19077 non-null  int64 
 11  oucollaborate      19077 non-null  int64 
 12  oucontent          19077 non-null  int64 
 13  ouelluminate       19077 non-null  int64 
 14  ouwiki             19077 non-null  int64 
 15  page               19077 non-null  int64 
 16  questionnaire      19077 non-null  int64 
 17  quiz               19077 non-null  int64 
 18  repeatactivity     19077 non-null  int64 
 19  resource           19077 non-null  int64 
 20  sharedsubpage      19077 non-null  int64 
 21  subpage            19077 non-null  int64 
 22  url                19077 non-null  int64 
dtypes: bool(1), int64(21), object(1)
memory usage: 3.2+ MB
In [6]:
# Explicit null check per column (all zero, consistent with df.info() above).
df.isna().sum()
Out[6]:
ID_Student           0
course_year_month    0
pass_course          0
dataplus             0
dualpane             0
externalquiz         0
folder               0
forumng              0
glossary             0
homepage             0
htmlactivity         0
oucollaborate        0
oucontent            0
ouelluminate         0
ouwiki               0
page                 0
questionnaire        0
quiz                 0
repeatactivity       0
resource             0
sharedsubpage        0
subpage              0
url                  0
dtype: int64
In [7]:
# Drop the student identifier — it carries no predictive signal.
# drop(columns=...) replaces the positional axis argument, which is
# deprecated since pandas 1.0 and removed in pandas 2.0.
df = df.drop(columns='ID_Student')
In [8]:
# Target distribution: boolean pass/fail, 12358 of 19077 are True (~65%).
print(df['pass_course'].describe())
plt.figure(figsize=(9, 8))
# sns.distplot is deprecated (removed in seaborn 0.13); histplot is the
# direct replacement (hist_kws alpha becomes a plain keyword argument).
sns.histplot(df['pass_course'], color='g', bins=100, alpha=0.4);
count     19077
unique        2
top        True
freq      12358
Name: pass_course, dtype: object
In [9]:
# Pairwise Pearson correlations of the numeric columns.
# NOTE(review): on pandas >= 2.0 this raises for the object column
# 'course_year_month'; pass numeric_only=True there — confirm pandas version.
df.corr()
Out[9]:
pass_course dataplus dualpane externalquiz folder forumng glossary homepage htmlactivity oucollaborate ... ouelluminate ouwiki page questionnaire quiz repeatactivity resource sharedsubpage subpage url
pass_course 1.000000 0.154812 0.100451 0.087266 0.128947 0.203948 0.048854 0.295290 0.054852 0.156591 ... 0.045744 0.174393 0.091318 0.163362 0.210180 -0.005209 0.153820 0.010985 0.245619 0.201365
dataplus 0.154812 1.000000 0.454758 -0.072248 0.264993 0.174257 -0.015104 0.322993 0.282659 0.047325 ... 0.169203 0.041279 0.552618 0.752077 0.316170 -0.002521 0.048457 -0.016120 0.419391 0.230975
dualpane 0.100451 0.454758 1.000000 -0.081658 0.258780 0.151819 -0.022173 0.282362 0.012916 0.005734 ... 0.224996 0.189079 0.502815 0.544677 0.292210 0.004833 0.082508 -0.018504 0.342631 0.221684
externalquiz 0.087266 -0.072248 -0.081658 1.000000 -0.077683 0.149212 0.171191 0.285223 -0.061260 0.300073 ... 0.225364 0.209246 -0.069060 -0.098530 -0.069656 -0.002911 0.223869 -0.018618 0.424413 0.349830
folder 0.128947 0.264993 0.258780 -0.077683 1.000000 0.071099 -0.019212 0.187807 -0.053961 0.039887 ... -0.045589 -0.021863 0.442765 0.328777 0.330940 0.009563 0.054474 -0.017428 0.308711 0.085384
forumng 0.203948 0.174257 0.151819 0.149212 0.071099 1.000000 0.057331 0.731877 0.068599 0.221122 ... 0.125443 0.230668 0.156718 0.156156 0.221318 0.018476 0.191994 0.102713 0.383362 0.418783
glossary 0.048854 -0.015104 -0.022173 0.171191 -0.019212 0.057331 1.000000 0.096357 -0.018317 0.103250 ... 0.038345 0.038040 -0.022357 -0.018049 -0.024958 -0.000160 0.086778 0.000189 0.152831 0.080709
homepage 0.295290 0.322993 0.282362 0.285223 0.187807 0.731877 0.096357 1.000000 0.142335 0.301454 ... 0.230564 0.362058 0.350026 0.336082 0.463731 0.024496 0.332555 0.042075 0.687295 0.633643
htmlactivity 0.054852 0.282659 0.012916 -0.061260 -0.053961 0.068599 -0.018317 0.142335 1.000000 0.077830 ... -0.039765 -0.010692 0.129467 0.285169 0.215290 0.000648 0.002951 -0.013688 0.228946 0.074682
oucollaborate 0.156591 0.047325 0.005734 0.300073 0.039887 0.221122 0.103250 0.301454 0.077830 1.000000 ... -0.062475 0.116357 0.032593 0.058659 0.108873 0.000947 0.219304 -0.020573 0.335250 0.191467
oucontent 0.264943 0.590808 0.506918 -0.082768 0.356134 0.239202 -0.018528 0.533206 0.288287 0.126064 ... 0.168310 0.275083 0.616742 0.672447 0.530083 0.011278 0.155628 -0.040648 0.569371 0.350041
ouelluminate 0.045744 0.169203 0.224996 0.225364 -0.045589 0.125443 0.038345 0.230564 -0.039765 -0.062475 ... 1.000000 0.094605 0.249521 0.199361 0.122839 -0.001934 0.083152 -0.003796 0.310643 0.267894
ouwiki 0.174393 0.041279 0.189079 0.209246 -0.021863 0.230668 0.038040 0.362058 -0.010692 0.116357 ... 0.094605 1.000000 0.056819 0.025240 0.166998 -0.001947 0.160084 -0.026762 0.248239 0.436978
page 0.091318 0.552618 0.502815 -0.069060 0.442765 0.156718 -0.022357 0.350026 0.129467 0.032593 ... 0.249521 0.056819 1.000000 0.643555 0.462047 0.033752 0.106484 -0.026466 0.526778 0.240914
questionnaire 0.163362 0.752077 0.544677 -0.098530 0.328777 0.156156 -0.018049 0.336082 0.285169 0.058659 ... 0.199361 0.025240 0.643555 1.000000 0.383571 0.005354 0.062417 -0.022008 0.458479 0.198547
quiz 0.210180 0.316170 0.292210 -0.069656 0.330940 0.221318 -0.024958 0.463731 0.215290 0.108873 ... 0.122839 0.166998 0.462047 0.383571 1.000000 0.039301 0.209464 -0.024411 0.518357 0.238911
repeatactivity -0.005209 -0.002521 0.004833 -0.002911 0.009563 0.018476 -0.000160 0.024496 0.000648 0.000947 ... -0.001934 -0.001947 0.033752 0.005354 0.039301 1.000000 0.002780 -0.000645 0.023315 0.011293
resource 0.153820 0.048457 0.082508 0.223869 0.054474 0.191994 0.086778 0.332555 0.002951 0.219304 ... 0.083152 0.160084 0.106484 0.062417 0.209464 0.002780 1.000000 0.012213 0.372256 0.253894
sharedsubpage 0.010985 -0.016120 -0.018504 -0.018618 -0.017428 0.102713 0.000189 0.042075 -0.013688 -0.020573 ... -0.003796 -0.026762 -0.026466 -0.022008 -0.024411 -0.000645 0.012213 1.000000 -0.010316 0.024272
subpage 0.245619 0.419391 0.342631 0.424413 0.308711 0.383362 0.152831 0.687295 0.228946 0.335250 ... 0.310643 0.248239 0.526778 0.458479 0.518357 0.023315 0.372256 -0.010316 1.000000 0.544838
url 0.201365 0.230975 0.221684 0.349830 0.085384 0.418783 0.080709 0.633643 0.074682 0.191467 ... 0.267894 0.436978 0.240914 0.198547 0.238911 0.011293 0.253894 0.024272 0.544838 1.000000

21 rows × 21 columns

In [10]:
from ydata_profiling import ProfileReport
In [176]:
# Build an automated EDA report over the full frame with ydata-profiling.
profile = ProfileReport(df, title="Profiling Report")
In [177]:
# Render the profiling report inline (last expression of the cell displays).
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[177]:

In [178]:
# Export the same report as a standalone HTML file for sharing.
profile.to_file("Output_EDA.html")
Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Drop Highly Correlated Features¶

In [11]:
threshold = 0.7

def trimm_correlated(df, threshold):
    """Drop columns that are too correlated with an earlier column.

    A column is removed when the absolute Pearson correlation between it
    and ANY column to its left exceeds ``threshold`` (so of each highly
    correlated pair, the left-most column survives).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Non-numeric columns are excluded by ``corr`` (older
        pandas) and therefore dropped from the result as well.
    threshold : float
        Absolute-correlation cut-off.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the surviving columns, original order kept.
    """
    df_corr = df.corr(method='pearson', min_periods=1)
    # Mask the lower triangle including the diagonal so each pair is
    # tested exactly once: column j is only compared against columns i < j.
    lower_tri_mask = np.tril(np.ones([len(df_corr)] * 2, dtype=bool))
    correlated = (df_corr.mask(lower_tri_mask).abs() > threshold).any()
    # Keep the columns that never exceeded the threshold.
    # (The original also had an unreachable print after the return — removed.)
    keep_cols = correlated.index[~correlated]
    return df[keep_cols]
In [12]:
# Remove columns whose |corr| with an earlier column exceeds 0.7 —
# drops 'homepage' and 'questionnaire' (see correlation matrix above);
# the non-numeric 'course_year_month' column is also silently dropped.
df2 = trimm_correlated(df, 0.7)
In [13]:
# Verify the filter: remaining 19 columns all correlate below the 0.7 cut-off.
cor=df2.corr()
cor
Out[13]:
pass_course dataplus dualpane externalquiz folder forumng glossary htmlactivity oucollaborate oucontent ouelluminate ouwiki page quiz repeatactivity resource sharedsubpage subpage url
pass_course 1.000000 0.154812 0.100451 0.087266 0.128947 0.203948 0.048854 0.054852 0.156591 0.264943 0.045744 0.174393 0.091318 0.210180 -0.005209 0.153820 0.010985 0.245619 0.201365
dataplus 0.154812 1.000000 0.454758 -0.072248 0.264993 0.174257 -0.015104 0.282659 0.047325 0.590808 0.169203 0.041279 0.552618 0.316170 -0.002521 0.048457 -0.016120 0.419391 0.230975
dualpane 0.100451 0.454758 1.000000 -0.081658 0.258780 0.151819 -0.022173 0.012916 0.005734 0.506918 0.224996 0.189079 0.502815 0.292210 0.004833 0.082508 -0.018504 0.342631 0.221684
externalquiz 0.087266 -0.072248 -0.081658 1.000000 -0.077683 0.149212 0.171191 -0.061260 0.300073 -0.082768 0.225364 0.209246 -0.069060 -0.069656 -0.002911 0.223869 -0.018618 0.424413 0.349830
folder 0.128947 0.264993 0.258780 -0.077683 1.000000 0.071099 -0.019212 -0.053961 0.039887 0.356134 -0.045589 -0.021863 0.442765 0.330940 0.009563 0.054474 -0.017428 0.308711 0.085384
forumng 0.203948 0.174257 0.151819 0.149212 0.071099 1.000000 0.057331 0.068599 0.221122 0.239202 0.125443 0.230668 0.156718 0.221318 0.018476 0.191994 0.102713 0.383362 0.418783
glossary 0.048854 -0.015104 -0.022173 0.171191 -0.019212 0.057331 1.000000 -0.018317 0.103250 -0.018528 0.038345 0.038040 -0.022357 -0.024958 -0.000160 0.086778 0.000189 0.152831 0.080709
htmlactivity 0.054852 0.282659 0.012916 -0.061260 -0.053961 0.068599 -0.018317 1.000000 0.077830 0.288287 -0.039765 -0.010692 0.129467 0.215290 0.000648 0.002951 -0.013688 0.228946 0.074682
oucollaborate 0.156591 0.047325 0.005734 0.300073 0.039887 0.221122 0.103250 0.077830 1.000000 0.126064 -0.062475 0.116357 0.032593 0.108873 0.000947 0.219304 -0.020573 0.335250 0.191467
oucontent 0.264943 0.590808 0.506918 -0.082768 0.356134 0.239202 -0.018528 0.288287 0.126064 1.000000 0.168310 0.275083 0.616742 0.530083 0.011278 0.155628 -0.040648 0.569371 0.350041
ouelluminate 0.045744 0.169203 0.224996 0.225364 -0.045589 0.125443 0.038345 -0.039765 -0.062475 0.168310 1.000000 0.094605 0.249521 0.122839 -0.001934 0.083152 -0.003796 0.310643 0.267894
ouwiki 0.174393 0.041279 0.189079 0.209246 -0.021863 0.230668 0.038040 -0.010692 0.116357 0.275083 0.094605 1.000000 0.056819 0.166998 -0.001947 0.160084 -0.026762 0.248239 0.436978
page 0.091318 0.552618 0.502815 -0.069060 0.442765 0.156718 -0.022357 0.129467 0.032593 0.616742 0.249521 0.056819 1.000000 0.462047 0.033752 0.106484 -0.026466 0.526778 0.240914
quiz 0.210180 0.316170 0.292210 -0.069656 0.330940 0.221318 -0.024958 0.215290 0.108873 0.530083 0.122839 0.166998 0.462047 1.000000 0.039301 0.209464 -0.024411 0.518357 0.238911
repeatactivity -0.005209 -0.002521 0.004833 -0.002911 0.009563 0.018476 -0.000160 0.000648 0.000947 0.011278 -0.001934 -0.001947 0.033752 0.039301 1.000000 0.002780 -0.000645 0.023315 0.011293
resource 0.153820 0.048457 0.082508 0.223869 0.054474 0.191994 0.086778 0.002951 0.219304 0.155628 0.083152 0.160084 0.106484 0.209464 0.002780 1.000000 0.012213 0.372256 0.253894
sharedsubpage 0.010985 -0.016120 -0.018504 -0.018618 -0.017428 0.102713 0.000189 -0.013688 -0.020573 -0.040648 -0.003796 -0.026762 -0.026466 -0.024411 -0.000645 0.012213 1.000000 -0.010316 0.024272
subpage 0.245619 0.419391 0.342631 0.424413 0.308711 0.383362 0.152831 0.228946 0.335250 0.569371 0.310643 0.248239 0.526778 0.518357 0.023315 0.372256 -0.010316 1.000000 0.544838
url 0.201365 0.230975 0.221684 0.349830 0.085384 0.418783 0.080709 0.074682 0.191467 0.350041 0.267894 0.436978 0.240914 0.238911 0.011293 0.253894 0.024272 0.544838 1.000000
In [14]:
# Peek at the trimmed frame.
df2.head()
Out[14]:
pass_course dataplus dualpane externalquiz folder forumng glossary htmlactivity oucollaborate oucontent ouelluminate ouwiki page quiz repeatactivity resource sharedsubpage subpage url
0 True 0 0 0 0 0 0 0 0 50 0 0 0 65 0 5 0 8 0
1 True 0 2 0 0 525 0 0 0 1788 0 193 0 371 0 77 0 215 60
2 True 0 0 0 0 45 2 0 0 628 0 0 0 158 0 77 0 15 0
3 True 0 0 0 0 411 0 3 9 964 0 14 5 1001 0 20 0 254 37
4 True 0 0 0 0 109 4 0 26 478 0 0 0 118 0 84 0 26 0
In [15]:
# Persist the de-correlated dataset.
# NOTE(review): hardcoded absolute local path — prefer a configurable output dir;
# parquet would also preserve dtypes better than CSV.
df2.to_csv(r'C:\Users\sylvia.pereira\OneDrive - alteryx.com\Desktop\Dataset_Non_Correlated.csv', index=False)
In [16]:
# Split predictors / target. drop(columns=...) replaces the positional
# axis argument (deprecated since pandas 1.0, removed in pandas 2.0).
X = df2.drop(columns='pass_course')
y = df2['pass_course']
# 19077 rows, 18 candidate features.
X.shape
Out[16]:
(19077, 18)
In [17]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
import sklearn.feature_selection as fs
from numpy import array

Feature Selection¶

Filter Method Mutual Information¶

In [18]:
def featureSelect_dataframe(X, y, criteria, k):
    """Return the ``k`` columns of ``X`` ranked best by ``criteria``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature matrix.
    y : array-like
        Target vector aligned with ``X``.
    criteria : callable
        A scikit-learn univariate scoring function
        (e.g. ``mutual_info_classif`` or ``f_classif``).
    k : int
        Number of top-scoring columns to keep.

    Returns
    -------
    pandas.DataFrame
        ``X`` restricted to the selected columns, original order preserved.
    """
    selector = SelectKBest(criteria, k=k).fit(X, y)
    # Subset via the boolean support mask: keeps column labels, whereas
    # selector.transform(X) would return an unlabeled ndarray.
    # (The original computed that transform into an unused variable — removed.)
    return X.loc[:, selector.get_support()]

# Keep the 5 features with highest mutual information w.r.t. pass_course.
New_X = featureSelect_dataframe(X, y, mutual_info_classif, 5)
print('Reduced features:')
New_X.head()
Reduced features:
Out[18]:
forumng oucontent quiz resource subpage
0 0 50 65 5 8
1 525 1788 371 77 215
2 45 628 158 77 15
3 411 964 1001 20 254
4 109 478 118 84 26
In [103]:
# Raw mutual-information scores, one per column of X.
# NOTE(review): these values differ from the ranking shown in the next cell
# (non-sequential execution counts In[103] vs In[20]); mutual_info_classif is
# stochastic — re-run top-to-bottom, or pass random_state for reproducibility.
mutual_info = mutual_info_classif(X,y)
mutual_info
Out[103]:
array([0.03092671, 0.00753573, 0.01313172, 0.01284323, 0.07416067,
       0.01392097, 0.00069133, 0.02270708, 0.06480582, 0.00309643,
       0.0266207 , 0.01126403, 0.08431983, 0.        , 0.08281785,
       0.        , 0.06610518, 0.04869091])
In [20]:
# Label the scores with their column names and rank descending.
# quiz/resource/forumng/oucontent/subpage dominate, matching SelectKBest's pick.
mutual_info = pd.Series(mutual_info)
mutual_info.index = X.columns
mutual_info.sort_values(ascending = False)
Out[20]:
quiz              0.089455
resource          0.087706
forumng           0.073942
oucontent         0.068503
subpage           0.062333
url               0.054060
dataplus          0.027015
ouwiki            0.025741
oucollaborate     0.018504
folder            0.015636
externalquiz      0.013252
glossary          0.012124
dualpane          0.010155
page              0.007011
ouelluminate      0.002484
repeatactivity    0.001636
sharedsubpage     0.001521
htmlactivity      0.000000
dtype: float64
In [21]:
# Bar chart of the mutual-information ranking (trailing ';' would hide the Axes repr).
mutual_info.sort_values(ascending = False).plot.bar(figsize=(20,8))
Out[21]:
<AxesSubplot:>
In [22]:
# Cross-check: SelectKBest directly, same criterion and k as the helper above.
# NOTE(review): the name 'sel_ten' suggests k=10 but k=5 is used — rename or adjust.
# .fillna(0) is a no-op here (df.isna().sum() above showed no missing values).
sel_ten = SelectKBest(mutual_info_classif, k=5)
sel_ten.fit(X.fillna(0), y)
X.columns[sel_ten.get_support()]
Out[22]:
Index(['forumng', 'oucontent', 'quiz', 'resource', 'subpage'], dtype='object')
In [23]:
# NOTE(review): seaborn/matplotlib were already imported in the first cell;
# the re-imports here are redundant (harmless, but move imports to the top).
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Correlation heatmap of the 5 selected features.
cor = New_X.corr()
plt.figure(figsize = (10,6))
sns.heatmap(cor, annot = True)
Out[23]:
<AxesSubplot:>

Embedded Method¶

In [18]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso

Old Lasso¶

In [19]:
from sklearn.linear_model import LassoCV
# NOTE(review): Lasso is a regression model and the target is boolean
# pass/fail, so the R^2 score below is only a rough relevance signal;
# LogisticRegression(penalty='l1') would be the classification analogue.
# Consider LassoCV(random_state=...) for a reproducible CV split.
reg = LassoCV()
reg.fit(X, y)
# Fixed typo in the messages: "Besy" -> "Best".
print('Best alpha using built-in LassoCV: %f' % reg.alpha_)
print('Best score using built-in LassoCV: %f' % reg.score(X, y))
# Signed coefficients, labelled by feature name, for the importance plot below.
coef = pd.Series(reg.coef_, index=X.columns)
Best alpha using built-in LassoCV: 0.091697
Best score using built-in LassoCV: 0.123735
In [20]:
# Count features with non-zero vs zeroed-out Lasso coefficients (11 vs 7).
print ('Lasso picked ' + str(sum(coef != 0)) + ' variables and eliminated the other ' + str(sum(coef == 0)) + ' variables.')
Lasso picked 11 variables and eliminated the other 7 variables.
In [21]:
# Absolute coefficient magnitude as a crude importance measure.
importance = np.abs(coef)
importance
Out[21]:
dataplus          0.000000
dualpane          0.000000
externalquiz      0.000354
folder            0.000000
forumng           0.000071
glossary          0.000323
htmlactivity      0.000000
oucollaborate     0.001985
oucontent         0.000131
ouelluminate      0.000000
ouwiki            0.000270
page              0.006090
quiz              0.000065
repeatactivity    0.000000
resource          0.000275
sharedsubpage     0.000000
subpage           0.000072
url               0.000245
dtype: float64
In [22]:
# The 11 non-zero importances only (order follows the Series above).
np.array(importance)[importance > 0]
Out[22]:
array([3.53847966e-04, 7.13504852e-05, 3.23436899e-04, 1.98472367e-03,
       1.30918872e-04, 2.69649328e-04, 6.08976163e-03, 6.52754519e-05,
       2.75313654e-04, 7.15132332e-05, 2.45476373e-04])
In [23]:
# Work on a copy so the column drops below don't mutate df2.
# (The bare `df2` expression in the original cell was a no-op:
# only the LAST expression of a cell is displayed.)
df3 = df2.copy()
In [24]:
# Sanity-check the copy.
df3.head()
Out[24]:
pass_course dataplus dualpane externalquiz folder forumng glossary htmlactivity oucollaborate oucontent ouelluminate ouwiki page quiz repeatactivity resource sharedsubpage subpage url
0 True 0 0 0 0 0 0 0 0 50 0 0 0 65 0 5 0 8 0
1 True 0 2 0 0 525 0 0 0 1788 0 193 0 371 0 77 0 215 60
2 True 0 0 0 0 45 2 0 0 628 0 0 0 158 0 77 0 15 0
3 True 0 0 0 0 411 0 3 9 964 0 14 5 1001 0 20 0 254 37
4 True 0 0 0 0 109 4 0 26 478 0 0 0 118 0 84 0 26 0
In [25]:
# Drop the target plus the 7 features Lasso zeroed out
# (dataplus, dualpane, folder, htmlactivity, ouelluminate,
#  repeatactivity, sharedsubpage — see the importance Series above).
df4 = df3.drop(['pass_course', 'dataplus', 'dualpane', 'folder', 'htmlactivity', 'ouelluminate', 'repeatactivity', 'sharedsubpage'], axis=1)
In [26]:
# Final 11-feature frame.
df4.head()
Out[26]:
externalquiz forumng glossary oucollaborate oucontent ouwiki page quiz resource subpage url
0 0 0 0 0 50 0 0 65 5 8 0
1 0 525 0 0 1788 193 0 371 77 215 60
2 0 45 2 0 628 0 0 158 77 15 0
3 0 411 0 9 964 14 5 1001 20 254 37
4 0 109 4 26 478 0 0 118 84 26 0
In [27]:
# Sort signed coefficients for the horizontal bar plot below.
imp_coef = coef.sort_values()
# Top-level matplotlib package needed for rcParams (pyplot already imported).
import matplotlib
In [28]:
# Horizontal bar chart of signed Lasso coefficients (zeroed features show as empty bars).
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind = 'barh')
plt.title ('Feature importance using Lasso Model')
Out[28]:
Text(0.5, 1.0, 'Feature importance using Lasso Model')
In [29]:
# Modelling frame: 19077 rows x 11 Lasso-selected features.
# NOTE(review): df_final is an alias (not a copy) of df4.
df_final = df4
df_final.shape
Out[29]:
(19077, 11)
In [30]:
# Select Features
feature = df_final

# Select Target
target = y

# Set Training and Testing Data: 80/20 split, fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature, target,
                                                    shuffle=True,
                                                    test_size=0.2,
                                                    random_state=1)

# Show the Training and Testing Data
print('Shape of training feature:', X_train.shape)
print('Shape of testing feature:', X_test.shape)
print('Shape of training label:', y_train.shape)
# Fixed label: the original printed "training label" twice.
print('Shape of testing label:', y_test.shape)
Shape of training feature: (15261, 11)
Shape of testing feature: (3816, 11)
Shape of training label: (15261,)
Shape of testing label: (3816,)
In [31]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted binary classifier on held-out data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test : array-like
        Held-out feature matrix.
    y_test : array-like
        Held-out true labels.

    Returns
    -------
    dict
        'acc', 'prec', 'rec', 'f1', 'kappa' (threshold-based scores),
        'fpr'/'tpr' (ROC curve points), 'auc', and 'cm' (confusion matrix,
        rows = true class, columns = predicted class).
    """
    from sklearn import metrics

    # Hard class predictions for the threshold-based metrics.
    y_pred = model.predict(X_test)

    # Calculate accuracy, precision, recall, f1-score, and kappa score.
    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    kappa = metrics.cohen_kappa_score(y_test, y_pred)

    # Ranking-based metrics need the positive-class probability.
    # ([:, 1] replaces the original's unusual-but-equivalent [::,1] slice.)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
    auc = metrics.roc_auc_score(y_test, y_pred_proba)

    # Confusion matrix.
    cm = metrics.confusion_matrix(y_test, y_pred)

    return {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'kappa': kappa,
            'fpr': fpr, 'tpr': tpr, 'auc': auc, 'cm': cm}

Building Models¶

Decision Tree Model - With Feature Selection¶

In [35]:
from sklearn import tree

# Building Decision Tree model (fixed seed so the fitted tree is reproducible).
dtc = tree.DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)
Out[35]:
DecisionTreeClassifier(random_state=0)
In [36]:
# Evaluate Model
# NOTE(review): this cell is duplicated verbatim in the next one — keep one.
dtc_eval = evaluate_model(dtc, X_test, y_test)
In [37]:
# Evaluate Model (re-runs the evaluation already done in the previous cell)
dtc_eval = evaluate_model(dtc, X_test, y_test)

# Print result
print('Accuracy:', dtc_eval['acc'])
print('Precision:', dtc_eval['prec'])
print('Recall:', dtc_eval['rec'])
print('F1 Score:', dtc_eval['f1'])
print('Cohens Kappa Score:', dtc_eval['kappa'])
print('Area Under Curve:', dtc_eval['auc'])
print('Confusion Matrix:\n', dtc_eval['cm'])
Accuracy: 0.7465932914046122
Precision: 0.8126540673788003
Recall: 0.7946966653274408
F1 Score: 0.8035750558602478
Cohens Kappa Score: 0.44674226009023477
Area Under Curve: 0.728914685051302
Confusion Matrix:
 [[ 871  456]
 [ 511 1978]]

Random Forest - With Feature Selection¶

In [38]:
from sklearn.ensemble import RandomForestClassifier

# Building Random Forest model (default hyperparameters, fixed seed).
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
Out[38]:
RandomForestClassifier(random_state=0)
In [39]:
# Evaluate Model — best scorer of the four (acc 0.822, AUC 0.868).
rf_eval = evaluate_model(rf, X_test, y_test)

# Print result
print('Accuracy:', rf_eval['acc'])
print('Precision:', rf_eval['prec'])
print('Recall:', rf_eval['rec'])
print('F1 Score:', rf_eval['f1'])
print('Cohens Kappa Score:', rf_eval['kappa'])
print('Area Under Curve:', rf_eval['auc'])
print('Confusion Matrix:\n', rf_eval['cm'])
Accuracy: 0.8223270440251572
Precision: 0.8289139120958954
Recall: 0.9168340699075934
F1 Score: 0.8706600534147271
Cohens Kappa Score: 0.5892629953970749
Area Under Curve: 0.8677649631248633
Confusion Matrix:
 [[ 856  471]
 [ 207 2282]]

Naive Bayes - With Feature Selection¶

In [40]:
from sklearn.naive_bayes import GaussianNB

# Building Naive Bayes model
# NOTE(review): Gaussian NB assumes normally distributed features; these
# activity counts are heavily skewed, which likely explains its weak recall.
nb = GaussianNB()
nb.fit(X_train, y_train)
Out[40]:
GaussianNB()
In [41]:
# Evaluate Model — high precision but very low recall (0.364): it misses
# most passing students (1583 false negatives in the confusion matrix).
nb_eval = evaluate_model(nb, X_test, y_test)

# Print result
print('Accuracy:', nb_eval['acc'])
print('Precision:', nb_eval['prec'])
print('Recall:', nb_eval['rec'])
print('F1 Score:', nb_eval['f1'])
print('Cohens Kappa Score:', nb_eval['kappa'])
print('Area Under Curve:', nb_eval['auc'])
print('Confusion Matrix:\n', nb_eval['cm'])
Accuracy: 0.5445492662473794
Precision: 0.8539114043355325
Recall: 0.3640016070711129
F1 Score: 0.5104225352112676
Cohens Kappa Score: 0.19756877740273548
Area Under Curve: 0.7351959775990999
Confusion Matrix:
 [[1172  155]
 [1583  906]]

K-Nearest Neighbors With Feature Selection¶

In [42]:
from sklearn.neighbors import KNeighborsClassifier

# Building KNN model (default k=5).
# NOTE(review): KNN is distance-based — consider scaling the features
# (e.g. StandardScaler) since the activity counts span very different ranges.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
Out[42]:
KNeighborsClassifier()
In [43]:
# Evaluate Model — second best overall (acc 0.789, AUC 0.807).
knn_eval = evaluate_model(knn, X_test, y_test)

# Print result
print('Accuracy:', knn_eval['acc'])
print('Precision:', knn_eval['prec'])
print('Recall:', knn_eval['rec'])
print('F1 Score:', knn_eval['f1'])
print('Cohens Kappa Score:', knn_eval['kappa'])
print('Area Under Curve:', knn_eval['auc'])
print('Confusion Matrix:\n', knn_eval['cm'])
Accuracy: 0.7887840670859538
Precision: 0.8052230685527747
Recall: 0.8919244676576938
F1 Score: 0.846359130766298
Cohens Kappa Score: 0.5113591626829606
Area Under Curve: 0.806533222440986
Confusion Matrix:
 [[ 790  537]
 [ 269 2220]]
In [ ]: